package cn.doitedu.ml.regression

import cn.doitedu.commons.util.SparkUtil
import org.apache.log4j.{Level, Logger}
import cn.doitedu.ml.util.VecUtil.arr2Vec
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.regression.{LinearRegression, RandomForestRegressor}

/**
 * @date: 2020/2/23
 * @site: www.doitedu.cn
 * @author: hunter.d 涛哥
 * @qq: 657270652
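 * @description: Linear regression demo program: house-price prediction.
 *     In the project, the linear regression algorithm can also be applied to click-through-rate prediction.
 *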
 */
object LinearRegressionDemo {

  // Demo flow: load a labelled CSV sample, vectorise its feature columns, fit a
  // LinearRegression and a RandomForestRegressor on it, then print the RMSE of
  // both models on the training sample and on a separate test sample.

  /**
   * Runs the regression demo end to end.
   *
   * The Spark session is always closed, even when loading, training or
   * evaluation throws, so a failed run does not leave the session open.
   *
   * @param args command-line arguments; not read by this program
   */
  def main(args: Array[String]): Unit = {

    // Silence everything below WARN for loggers under "org" so the printed
    // tables and metrics are not buried in framework logging.
    Logger.getLogger("org").setLevel(Level.WARN)

    // NOTE(review): SparkUtil is a project helper; it is used here as if it
    // returns a SparkSession named after this object — confirm in SparkUtil.
    val spark = SparkUtil.getSparkSession(this.getClass.getSimpleName)
    // Nothing below visibly needs these implicits; kept so compilation of the
    // block is unchanged.
    import spark.implicits._

    try {
      // Column / expression names shared by the training and test pipelines.
      val labelCol = "price"
      val featuresCol = "features"
      // Packs the `area` and `floor` columns into a single feature-vector column.
      val featuresExpr = "arr2vec(array(area,floor)) as features"

      // Reads a CSV file that has a header row, letting Spark infer column types.
      def readCsv(path: String) =
        spark.read.option("header","true").option("inferSchema","true").csv(path)

      // Load the training sample (features: area, floor; label: price).
      val sample = readCsv("userprofile/data/demo/house/h.csv")
      sample.printSchema()
      sample.show(100,false)

      // Expose the project's arr2Vec helper to Spark SQL under the name
      // "arr2vec" so it can be called from selectExpr.
      // NOTE(review): presumably converts an array of numbers into an ML
      // vector — confirm in VecUtil.
      spark.udf.register("arr2vec",arr2Vec)
      val train = sample.selectExpr(featuresExpr, labelCol)

      // 1. Linear regression: configure, fit on the training sample.
      val linear = new LinearRegression()
        .setFeaturesCol(featuresCol)
        .setLabelCol(labelCol)
      val linearModel = linear.fit(train)

      // In-sample predictions: the model is applied to the same rows it was
      // trained on (not to unseen data).
      val trainPredictLinear = linearModel.transform(train)
      trainPredictLinear.show(100,false)

      // 2. Random-forest regression: configure, fit, predict in-sample.
      val forest = new RandomForestRegressor()
        .setFeaturesCol(featuresCol)
        .setLabelCol(labelCol)
      val forestModel = forest.fit(train)
      val trainPredictForest = forestModel.transform(train)
      trainPredictForest.show(100,false)

      // Regression quality metric: root mean squared error ("rmse").
      // Other metric names accepted by RegressionEvaluator: "mse", "r2", "mae".
      val evaluator = new RegressionEvaluator()
        .setPredictionCol("prediction")
        .setLabelCol(labelCol)
        .setMetricName("rmse")

      // Training-set RMSE: linear regression first, then random forest.
      val rmseTrainLinear = evaluator.evaluate(trainPredictLinear)
      println(rmseTrainLinear)

      val rmseTrainForest = evaluator.evaluate(trainPredictForest)
      println(rmseTrainForest)

      // Predict on the test sample (rows that did not take part in training).
      val testSample = readCsv("userprofile/data/demo/house/t.csv")
      val test = testSample.selectExpr(featuresExpr, labelCol)

      val testPredictLinear = linearModel.transform(test)
      testPredictLinear.show(100,false)

      val testPredictForest = forestModel.transform(test)
      testPredictForest.show(100,false)

      // Test-set RMSE: linear regression first, then random forest.
      val rmseTestLinear = evaluator.evaluate(testPredictLinear)
      val rmseTestForest = evaluator.evaluate(testPredictForest)

      println(rmseTestLinear)
      println(rmseTestForest)
    } finally {
      // Release the session on both the success and the failure path.
      spark.close()
    }

  }

}
